# Lookup table from two-letter USPS postal codes to full state/territory
# names (50 states plus DC and the inhabited territories, 56 entries).
# Built from two parallel vectors inside local() so no helper variables
# leak into the global environment.
POSTAL_TO_STATE = local({
  codes = c(
    'AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL',
    'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
    'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI',
    'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VI', 'VA', 'WA', 'WV', 'WI',
    'WY')
  full_names = c(
    'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut', 'Delaware',
    'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
    'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
    'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Northern Mariana Islands',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico',
    'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
    'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming')
  as.list(setNames(full_names, codes))
})
# Lowercase postal abbreviations for the 50 US states (no DC, no
# territories); these drive the per-state download/processing loops.
states = c(
  "al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga",
  "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md",
  "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj",
  "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc",
  "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy"
)
# Template URL for a state's daily Google symptom-search CSV.
# '{state}' and '{state_underscore}' are placeholders that
# load_state_data() substitutes per state.
BASE_DAILY_URL = paste0(
  'https://raw.githubusercontent.com/google-research/open-covid-19-data/',
  'master/data/exports/search_trends_symptoms_dataset/',
  'United%20States%20of%20America/subregions/{state}/',
  '2020_US_{state_underscore}_daily_symptoms_dataset.csv'
)
# In-memory cache of per-state data frames, keyed by postal abbreviation.
cache_data_list = list()
# Plain-English glosses for a handful of symptom signals; joined onto
# the correlation tables later for more readable output.
signal_description_df = tibble(
  signal = c('Podalgia', 'Anosmia', 'Purpura', 'Radiculopathy',
             'Ageusia', 'Erythema chronicum migrans', 'Photodermatitis'),
  description = c('pain in the foot',
                  'loss of smell',
                  "red/purple skin spots; 'blood spots'",
                  'pinched nerve',
                  'loss of taste',
                  'expanding rash early in lyme disease',
                  'allergic rash that reqs light')
)
# Expand a (case-insensitive) two-letter postal code to the full state
# name, e.g. 'ny' -> 'New York'. Unknown codes yield NULL, since `[[`
# on a list returns NULL for a missing name.
expand_state_name = function(state) {
  # base toupper() is sufficient here; no need for stringr's str_to_upper
  POSTAL_TO_STATE[[toupper(state)]]
}
# Return the daily symptoms data frame for one state (lowercase postal
# code), using a two-level cache:
#   1. in-memory: cache_data_list (global, updated via <<-)
#   2. on disk:   cache/<state>.csv, downloaded on first use
load_state_data = function(state) {
  if (state %in% names(cache_data_list)) return (cache_data_list[[state]])
  # Make sure the on-disk cache directory exists: the top-level script
  # only creates it on the initial full run, so a call made after
  # symptom_df.RDS exists (but cache/ was deleted) would otherwise fail.
  if (!dir.exists('cache')) dir.create('cache')
  state_fname = sprintf('cache/%s.csv', state)
  # If there is no cached CSV, download it from the template URL.
  if (!file.exists(state_fname)) {
    state_name = expand_state_name(state)
    message(sprintf('Downloading data for %s...', state_name))
    state_name_underscore = str_replace_all(state_name, ' ', '_')
    # Fill the URL template: '{state}' gets the display name,
    # '{state_underscore}' the underscored file-name variant, and any
    # remaining spaces (multi-word state names) are percent-encoded.
    STATE_DAILY_URL = str_replace_all(BASE_DAILY_URL,
                                      fixed('{state}'), state_name)
    STATE_DAILY_URL = str_replace_all(STATE_DAILY_URL,
                                      fixed('{state_underscore}'),
                                      state_name_underscore)
    STATE_DAILY_URL = str_replace_all(STATE_DAILY_URL,
                                      fixed(' '), '%20')
    download.file(STATE_DAILY_URL, state_fname)
  }
  single_state = readr::read_csv(state_fname)
  # Populate the in-memory cache for subsequent calls.
  cache_data_list[[state]] <<- single_state
  return (single_state)
}
# Pull one symptom's county-level time series for one state, reshaped
# into a covidcast-style schema (geo_value / time_value / value / ...).
pull_data_state = function(state, symptom) {
  single_state = load_state_data(state)
  # (Removed a leftover debugging line that computed
  # unique(single_state$sub_region_2_code) and discarded the result.)
  # Keep county rows only; state-level rows have NA sub_region_2_code.
  single_state_counties = single_state[!is.na(single_state$sub_region_2_code),]
  # Symptom columns in the raw CSV are named 'symptom:<Name>'.
  selected_symptom = paste0('symptom:', symptom)
  single_state_symptom = single_state_counties[,c('sub_region_2_code',
                                                  'date',
                                                  selected_symptom)]
  # Shape into what we want
  colnames(single_state_symptom) = c('geo_value', 'time_value', 'value')
  single_state_symptom = single_state_symptom %>% filter(!is.na(value))
  # Zero-pad FIPS codes to 5 digits and add covidcast metadata columns.
  single_state_symptom %>% transmute(
    geo_value = sprintf('%05d', as.numeric(geo_value)),
    signal = symptom,
    time_value = time_value,
    direction = NA,
    issue = lubridate::today(),
    lag = issue - time_value,
    value = value,
    stderr = NA,
    sample_size = NA,
    data_source = 'google_symptoms'
  )
}
# Build the full symptom data frame across all states and symptoms,
# memoized on disk as symptom_df.RDS.
if (file.exists('symptom_df.RDS')) {
  symptom_df = readRDS('symptom_df.RDS')
  symptom_names = unique(symptom_df$signal)
} else {
  # showWarnings = FALSE: harmless if the directory already exists.
  dir.create('./cache/', showWarnings = FALSE)
  # Discover the available symptom columns from one small state.
  ak = load_state_data('ak')
  symptom_cols = colnames(ak)[
    str_detect(colnames(ak), 'symptom:')]
  symptom_names = str_replace(symptom_cols, fixed('symptom:'), '')
  # Preallocate one slot per symptom rather than growing a list.
  symptom_df_list = vector('list', length(symptom_names))
  names(symptom_df_list) = symptom_names
  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    states_list = vector('list', length(states))
    # seq_along() is safe even if `states` were empty (1:length is not).
    for (idx in seq_along(states)) {
      states_list[[idx]] = pull_data_state(states[idx], symptom)
    }
    symptom_df_list[[symptom]] = bind_rows(states_list)
  }
  symptom_df = bind_rows(symptom_df_list)
  saveRDS(symptom_df, 'symptom_df.RDS')
}
# Analysis window and the JHU confirmed-case incidence signal.
start_day = "2020-03-01"
end_day = "2020-08-15"
df_inum = covidcast_signal(data_source = "jhu-csse",
                           signal = "confirmed_7dav_incidence_num",
                           start_day = start_day, end_day = end_day)
# Restrict attention to counties with at least `case_num` total cases.
case_num = 500
geo_values = df_inum %>%
  group_by(geo_value) %>%
  summarize(total = sum(value)) %>%
  filter(total >= case_num) %>%
  pull(geo_value)
df_inum_act = df_inum %>% filter(geo_value %in% geo_values)
symptom_df_act = symptom_df %>% filter(geo_value %in% geo_values)
Here we plot the availability of each symptom over time (the proportion is the percentage of counties for which the symptom was available). We see that for each signal, the availability level is consistent over time, subject to a strong weekend effect.
# Availability of each signal over time: fraction of the active
# counties reporting that signal on each day.
availability_df = symptom_df_act %>%
  group_by(time_value, signal) %>%
  # .groups = 'drop' returns an ungrouped tibble directly (no separate
  # ungroup() needed) and silences the "`summarise()` regrouping" message.
  summarize(prop_available = n() / length(geo_values), .groups = 'drop')
## `summarise()` regrouping output by 'time_value' (override with `.groups` argument)
# One availability curve per signal; a single shared color keeps the
# focus on the overall pattern rather than on individual symptoms.
plt = ggplot(availability_df) +
  geom_line(aes(x = time_value,
                y = prop_available,
                group = factor(signal)),
            color = 'dodgerblue4',
            size = 0.1)
plt
The symptoms for which data are sparsest are:
# Rank signals by mean availability and keep the sparsest ones (<= 5%
# of active counties on an average day), sparsest first.
most_missing = availability_df %>%
  group_by(signal) %>%
  summarize(avg_available = mean(prop_available)) %>%
  ungroup() %>%
  filter(avg_available <= 0.05) %>%
  arrange(avg_available)
## `summarise()` ungrouping output (override with `.groups` argument)
print(most_missing)
## # A tibble: 16 x 2
## signal avg_available
## <chr> <dbl>
## 1 Viral pneumonia 0.0279
## 2 Aphonia 0.0334
## 3 Crackles 0.0353
## 4 Burning Chest Pain 0.0363
## 5 Urinary urgency 0.0369
## 6 Polydipsia 0.0371
## 7 Photodermatitis 0.0380
## 8 Dysautonomia 0.0414
## 9 Shallow breathing 0.0414
## 10 Allergic conjunctivitis 0.0435
## 11 Laryngitis 0.0437
## 12 Ventricular fibrillation 0.0455
## 13 Angular cheilitis 0.0462
## 14 Myoclonus 0.0462
## 15 Rectal pain 0.0489
## 16 Stridor 0.0491
For the most sparsely available signal, the average number of counties for which it is available on a given day is:
# Average daily county count for the single sparsest signal.
print(length(geo_values) * min(most_missing$avg_available))
## [1] 29.52653
Based on this, we leave all the symptoms in for the full correlations analysis.
# Time-sliced Spearman correlations between each symptom signal and
# case incidence, memoized on disk as cor_df.RDS.
if (file.exists('cor_df.RDS')) {
  cor_df = readRDS('cor_df.RDS')
} else {
  # Preallocate one slot per symptom; this only needs to exist in the
  # recompute branch (previously it was allocated unconditionally).
  cor_list = vector('list', length(symptom_names))
  names(cor_list) = symptom_names
  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    df_cor1 = covidcast_cor(symptom_df_act %>% filter(signal == symptom),
                            df_inum_act,
                            by = "time_value",
                            method = "spearman")
    df_cor1['signal'] = symptom
    cor_list[[symptom]] = df_cor1
  }
  cor_df = bind_rows(cor_list)
  saveRDS(cor_df, 'cor_df.RDS')
}
# Attach human-readable descriptions. Note: dplyr's join-key argument
# is `by =`, not `on =` (a pandas-ism). The original `on='signal'` was
# silently swallowed by `...` and the join only worked because it fell
# back to a natural join on the shared column.
cor_df = cor_df %>% left_join(signal_description_df, by = 'signal')
## Joining, by = "signal"
## Warning: Removed 7174 row(s) containing missing values (geom_path).
When we discuss the “size” of a correlation, we mean the absolute value of the correlation.
# For each signal, keep the day on which |correlation| peaks, then
# take the 5 signals with the largest peaks overall.
top_cor_signals = plot_cor_df %>%
  group_by(signal) %>%
  filter(abs(value) == max(abs(value), na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(abs(value))) %>%
  head(5)
# Five-number-plus-mean summary of the correlation distribution for
# each of the top signals.
top_cor_sum_stats = plot_cor_df %>%
  filter(signal %in% top_cor_signals$signal) %>%
  group_by(signal) %>%
  summarize(min = min(value, na.rm = TRUE),
            quart1 = quantile(value, 0.25, na.rm = TRUE),
            med = median(value, na.rm = TRUE),
            mean = mean(value, na.rm = TRUE),
            quart3 = quantile(value, 0.75, na.rm = TRUE),
            max = max(value, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
print('Symptoms with the largest all-time correlation:')
## [1] "Symptoms with the largest all-time correlation:"
# `by =` (not `on =`) is dplyr's join-key argument; the original
# `on='signal'` was ignored and the join merely defaulted to the
# shared column.
print(top_cor_signals %>% left_join(top_cor_sum_stats, by = 'signal')
      %>% select(-time_value, -value),
      width=100)
## Joining, by = "signal"
## # A tibble: 5 x 8
## signal description min quart1
## <chr> <chr> <dbl> <dbl>
## 1 Viral pneumonia <NA> -0.898 -0.269
## 2 Anosmia loss of smell -0.0788 0.403
## 3 Ageusia loss of taste -0.423 0.260
## 4 Erythema chronicum migrans expanding rash early in lyme disease -0.710 -0.556
## 5 Photodermatitis allergic rash that reqs light -0.709 -0.487
## med mean quart3 max
## <dbl> <dbl> <dbl> <dbl>
## 1 0.0853 0.0216 0.327 0.668
## 2 0.610 0.545 0.725 0.833
## 3 0.552 0.450 0.682 0.797
## 4 -0.276 -0.325 -0.179 0.208
## 5 -0.366 -0.349 -0.211 0.184
# Plot every signal's correlation curve: non-top signals in a faint
# background color first, then the top-5 signals on top in distinct
# colors (layer order matters for what is drawn over what).
plt = ggplot(plot_cor_df) +
  geom_line(aes(x = time_value,
                y = value,
                group = factor(signal)),
            data = plot_cor_df %>%
              filter(!signal %in% top_cor_signals$signal),
            color = 'cornsilk',
            size = 0.1,
            alpha = 1.0) +
  geom_line(aes(x = time_value,
                y = value,
                group = factor(signal),
                colour = factor(signal)),
            data = plot_cor_df %>%
              filter(signal %in% top_cor_signals$signal),
            size = 0.3) +
  ylab('rank correlation') +
  scale_x_date(breaks = lubridate::ymd(c(
    '2020-03-01', '2020-03-15', '2020-04-01', '2020-04-15',
    '2020-05-01', '2020-05-15', '2020-06-01', '2020-06-15',
    '2020-07-01', '2020-07-15', '2020-08-01', '2020-08-15'))) +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Top 5 signals by all-time max(|corr|)")
plt
## Warning: Removed 7089 row(s) containing missing values (geom_path).
## Warning: Removed 85 row(s) containing missing values (geom_path).
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Symptoms that consistently stay away from 0 correlation:"
## Joining, by = "signal"
## # A tibble: 5 x 8
## signal description min quart1
## <chr> <chr> <dbl> <dbl>
## 1 Podalgia pain in the foot -0.423 -0.279
## 2 Restless legs syndrome <NA> -0.528 -0.410
## 3 Anosmia loss of smell -0.0788 0.403
## 4 Purpura red/purple skin spots; 'blood spots' -0.533 -0.366
## 5 Radiculopathy pinched nerve -0.477 -0.349
## med mean quart3 max
## <dbl> <dbl> <dbl> <dbl>
## 1 -0.217 -0.223 -0.166 -0.0710
## 2 -0.337 -0.310 -0.215 -0.0530
## 3 0.610 0.545 0.725 0.833
## 4 -0.283 -0.289 -0.206 -0.0477
## 5 -0.278 -0.256 -0.154 -0.0425
## Warning: Removed 7089 row(s) containing missing values (geom_path).
## Warning: Removed 85 row(s) containing missing values (geom_path).
## [1] "Podalgia" "Restless legs syndrome" "Anosmia"
## [4] "Purpura" "Radiculopathy"
## [1] "Anosmia"
Interestingly, although anosmia technically is one of the “most stable” by our criterion, its distribution actually covers zero.
# Per-county (all-time) Spearman correlations between each symptom and
# case incidence, memoized on disk as geo_cor_df.RDS.
if (file.exists('geo_cor_df.RDS')) {
  geo_cor_df = readRDS('geo_cor_df.RDS')
} else {
  # Preallocate one slot per symptom, then fill in a loop.
  geo_cor_list = vector('list', length(symptom_names))
  names(geo_cor_list) = symptom_names
  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    one_signal_cor = covidcast_cor(
      symptom_df_act %>% filter(signal == symptom),
      df_inum_act,
      by = "geo_value",
      method = "spearman")
    one_signal_cor['signal'] = symptom
    geo_cor_list[[symptom]] = one_signal_cor
  }
  geo_cor_df = bind_rows(geo_cor_list)
  saveRDS(geo_cor_df, 'geo_cor_df.RDS')
}
# Attach descriptions; use dplyr's `by =` for the join key — the
# original `on='signal'` is a pandas-ism that dplyr silently ignored
# (the natural-join fallback happened to save it).
geo_cor_df = geo_cor_df %>% left_join(signal_description_df, by = 'signal')
## Joining, by = "signal"
# Choropleth of per-county correlations for each top-correlation signal.
for (symptom in top_cor_signals$signal) {
  one_map_df = geo_cor_df %>% filter(signal == symptom)
  # covidcast's plot method expects time_value/issue and covidcast
  # metadata, so dress the correlation frame up as a covidcast_signal.
  one_map_df$time_value = min_available_time
  one_map_df$issue = min_available_time
  attributes(one_map_df)$geo_type = 'county'
  class(one_map_df) = c("covidcast_signal", "data.frame")
  n_counties = one_map_df %>% filter(!is.na(value)) %>% nrow()
  map_title = sprintf("Correlations between cases and %s (%d counties)",
                      symptom, n_counties)
  # Append the plain-English gloss when one is available.
  if (!is.na(one_map_df$description[1])) {
    map_title = paste0(map_title, '\n',
                       sprintf('(%s)', one_map_df$description[1]))
  }
  # Plot choropleth maps, using the covidcast plotting functionality
  print(plot(one_map_df,
             title = map_title,
             range = c(-1, 1), choro_col = c("orange","lightblue", "purple")))
}
# Same choropleths, but for the most consistently-nonzero signals.
for (symptom in top_min_cor$signal) {
  stable_map_df = geo_cor_df %>% filter(signal == symptom)
  # Dress the correlation frame up as a covidcast_signal so the
  # covidcast plot method accepts it.
  stable_map_df$time_value = min_available_time
  stable_map_df$issue = min_available_time
  attributes(stable_map_df)$geo_type = 'county'
  class(stable_map_df) = c("covidcast_signal", "data.frame")
  county_count = stable_map_df %>% filter(!is.na(value)) %>% nrow()
  plot_title = sprintf("Correlations between cases and %s (%d counties)",
                       symptom, county_count)
  # Append the plain-English gloss when one is available.
  if (!is.na(stable_map_df$description[1])) {
    plot_title = paste0(plot_title, '\n',
                        sprintf('(%s)', stable_map_df$description[1]))
  }
  # Plot choropleth maps, using the covidcast plotting functionality
  print(plot(stable_map_df,
             title = plot_title,
             range = c(-1, 1), choro_col = c("orange","lightblue", "purple")))
}
TODO:
Some things I’m still worried about / next steps: